set_model_config_manually
set_layernum_manually
num_nodes = 1  (same as nnode_nodes)
num_gpus_per_node = 8
settle_chunk =
settle_bsz = global_batch_size
# do not use bsz_scale=8

epoch
pipeline_model_parallel_size
tensor_model_parallel_size
virtual_pipeline_model_parallel_size
gpus_per_stage
nstages_per_node
micro_batch_size --> computed (derived, not set by hand)
tensor_length = sequence_length
hidden_size
intermediate_size

num_hidden_layers =
num_attention_heads =
vocab_size =